The dataset on police incidents in the City of Seattle includes a collection of events related to 911 emergency calls recorded from 2009 to the present day. The dataset contains approximately 10 million records and has a total size of 5.6GB. It is an open dataset obtained from the official open data portal of the City of Seattle (data.seattle.gov).
This dataset contains information about calls made to the police and the corresponding police responses in Seattle. The columns of the dataset are described in the schema used for loading below.
The dataset provides detailed information on all calls received by the police, including call types, priorities, call and response times, as well as the geographic locations of the incidents. It also includes the initial and final classification of each call, enabling analysis of how incidents are categorized and resolved.
Given the size of the dataset (5.6 GB), the data is loaded and further transformed using Apache Spark tools.
To ensure the correct data types are used, a schema is specified according to which Spark loads the data frames.
# Column type map handed to spark_read_csv(): every identifier,
# category and raw time column is read as a string, and only the
# blurred coordinates are numeric.
character_columns <- c(
  "CAD_Event_Number", "Event_Clearance_Description", "Call_Type",
  "Priority", "Initial_Call_Type", "Final_Call_Type",
  "Original_Time_Queued", "Arrived_Time", "Precinct", "Sector", "Beat"
)
schema <- c(
  as.list(setNames(rep("character", length(character_columns)), character_columns)),
  list(Blurred_Longitude = "numeric", Blurred_Latitude = "numeric")
)
# Load the raw CSV into Spark with the explicit schema defined above,
# so every column gets the intended type instead of being inferred.
df <- spark_read_csv(sc, name = "police_calls", path = "Call_Data.csv", header = TRUE, columns = schema)
# Drop every row with at least one missing value (~30% of the data,
# per the recorded output below).
df_clean <- df %>% na.omit()
## * Dropped 3180987 rows with 'na.omit' (10491323 => 7310336)
# Parse the raw time strings into proper Spark timestamps.
# NOTE(review): the two columns use different format patterns below
# ("MM/dd/yyyy ..." vs "yyyy MMM dd ..."). This presumably mirrors
# mixed formats in the raw CSV — confirm against a data sample, since
# a wrong pattern makes to_timestamp() return NULL and those rows are
# then silently removed by the filter that follows.
df_clean <- df_clean %>%
  mutate(
    Original_Time_Queued = to_timestamp(Original_Time_Queued, "MM/dd/yyyy hh:mm:ss a"),
    Arrived_Time = to_timestamp(Arrived_Time, "yyyy MMM dd hh:mm:ss a")
  )
# Keep only rows where both timestamps parsed and the officer arrived
# strictly after the call was queued.
df_clean <- df_clean %>%
  filter(!is.na(Original_Time_Queued) & !is.na(Arrived_Time) & Original_Time_Queued < Arrived_Time)
# Fixed: the original line used typographic quotes (“ ”), which do not
# parse in R; file paths must use ASCII double quotes.
rmarkdown::render("Documentation.Rmd")
# Sanity-check the parsed timestamps: min/max of both time columns,
# computed in Spark and collected as a single summary row.
df_clean %>% summarise(
min_original_time = min(Original_Time_Queued, na.rm = TRUE),
max_original_time = max(Original_Time_Queued, na.rm = TRUE),
min_arrived_time = min(Arrived_Time, na.rm = TRUE),
max_arrived_time = max(Arrived_Time, na.rm = TRUE)
) %>% collect()
## # A tibble: 1 × 4
## min_original_time max_original_time min_arrived_time
## <dttm> <dttm> <dttm>
## 1 2009-06-02 03:43:08 2025-10-07 23:58:31 2009-06-02 04:01:55
## # ℹ 1 more variable: max_arrived_time <dttm>
# Count incidents per police precinct in Spark, then pull the small
# aggregate table into R for plotting.
precinct_counts <- df_clean %>%
  group_by(Precinct) %>%
  summarise(Count = n()) %>%
  collect()
# Bar chart of incident volume by precinct (geom_col is the shorthand
# for geom_bar(stat = "identity")).
ggplot(precinct_counts, aes(x = Precinct, y = Count, fill = Precinct)) +
  geom_col() +
  labs(
    title = "Distribution by Police Precincts",
    x = "Police Precinct",
    y = "Number of Occurrences",
    fill = "Precinct"
  )
# Release the local aggregate and trigger garbage collection.
rm(precinct_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1837068 98.2 3613802 193 2404528 128.5
## Vcells 3300211 25.2 8388608 64 4462757 34.1
# Count events per clearance description in Spark, collect the small
# aggregate, and keep only the common outcomes (>= 5000 occurrences)
# so the chart stays legible. (Local name spelling corrected.)
clearance_counts <- df_clean %>%
  group_by(Event_Clearance_Description) %>%
  summarise(Count = n()) %>%
  collect() %>%
  filter(Count >= 5000)
# Bar chart of how cases are resolved; x tick labels suppressed
# because the descriptions are long (legend carries the names).
ggplot(clearance_counts, aes(x = Event_Clearance_Description, y = Count, fill = Event_Clearance_Description)) +
  geom_col() +
  labs(
    title = "Distribution by Case Resolution Type",
    x = "Resolution",
    y = "Number of Occurrences",
    fill = "Resolution Type"
  ) +
  theme(axis.text.x = element_blank())
# Release the local aggregate.
rm(clearance_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1866764 99.7 3613802 193 3613802 193.0
## Vcells 3369710 25.8 8388608 64 4462757 34.1
# Count incidents per priority level.
# Fixed: the aggregate was previously left as a lazy Spark table and
# handed straight to ggplot(); collect() brings it into R as a data
# frame, consistent with the other count plots in this file.
priority_counts <- df_clean %>%
  group_by(Priority) %>%
  summarise(Count = n()) %>%
  collect()
ggplot(priority_counts, aes(x = Priority, y = Count, fill = Priority)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribution by Priority",
       x = "Priority",
       y = "Number of Occurrences",
       fill = "Priority")
# Release the local aggregate.
rm(priority_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1861536 99.5 3613802 193 3613802 193.0
## Vcells 3357246 25.7 8388608 64 4462757 34.1
# Count incidents per call method, keeping only frequent ones (> 5000).
# Fixed: added collect() — the aggregate was previously a lazy Spark
# table passed straight to ggplot(), inconsistent with the precinct
# plot above.
call_type_counts <- df_clean %>%
  group_by(Call_Type) %>%
  summarise(Count = n()) %>%
  collect()
call_type_counts <- call_type_counts %>% filter(Count > 5000)
ggplot(call_type_counts, aes(x = Call_Type, y = Count, fill = Call_Type)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribution by Call Method",
       x = "Call Method",
       y = "Number of Occurrences",
       fill = "Call Method") +
  theme(axis.text.x = element_blank())
# Release the local aggregate.
rm(call_type_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1858768 99.3 3613802 193 3613802 193.0
## Vcells 3350102 25.6 8388608 64 4462757 34.1
# Sample 50% of the coordinate columns and collect locally for a
# histogram of the longitude distribution.
location_sample <- df_clean %>%
  select(Blurred_Longitude, Blurred_Latitude) %>%
  sdf_sample(fraction = 0.5, replacement = FALSE) %>%
  collect()
# Fixed: the plot shows Blurred_Longitude but was titled/labelled
# "Latitude"; labels now match the plotted variable.
ggplot(data = location_sample, aes(x = Blurred_Longitude)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Geographic Longitude",
       x = "Geographic Longitude",
       y = "Frequency")
# Release the sampled data.
rm(location_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1864958 99.6 3613802 193.0 3613802 193.0
## Vcells 9136542 69.8 35132325 268.1 43667894 333.2
# Sample 50% of the coordinate columns and collect locally for a
# histogram of the latitude distribution.
location_sample <- df_clean %>%
  select(Blurred_Longitude, Blurred_Latitude) %>%
  sdf_sample(fraction = 0.5, replacement = FALSE) %>%
  collect()
# Fixed: the plot shows Blurred_Latitude but was titled/labelled
# "Longitude"; labels now match the plotted variable.
ggplot(data = location_sample, aes(x = Blurred_Latitude)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Geographic Latitude",
       x = "Geographic Latitude",
       y = "Frequency")
# Release the sampled data.
rm(location_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1865168 99.7 3613802 193.0 3613802 193.0
## Vcells 9135717 69.7 35140058 268.1 43754770 333.9
# Restrict to a plausible Seattle bounding box (lat 47..48,
# lon -123..-122) to remove invalid/outlying coordinates before
# further spatial analysis.
df_clean_ll <- df_clean %>% filter(Blurred_Latitude >= 47 & Blurred_Latitude <= 48 & Blurred_Longitude >= -123 & Blurred_Longitude <= -122)
# Sample 50% of cleaned location data for histogram
location_clean_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
# Longitude histogram after the bounding-box cleanup.
ggplot(data = location_clean_sample, aes(x = Blurred_Longitude)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Distribution of Cleaned Longitude",
x = "Longitude",
y = "Frequency")
# Release the sampled data.
rm(location_clean_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1865625 99.7 3613802 193 3613802 193.0
## Vcells 8891809 67.9 35115460 268 43754770 333.9
# Sample 50% of cleaned location data for histogram
location_clean_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
# Latitude histogram after the bounding-box cleanup.
ggplot(data = location_clean_sample, aes(x = Blurred_Latitude)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Distribution of Cleaned Latitude",
x = "Latitude",
y = "Frequency")
# Release the sampled data.
rm(location_clean_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1865714 99.7 3613802 193.0 3613802 193.0
## Vcells 8888831 67.9 34719224 264.9 43754770 333.9
# Setting the API key value
# SECURITY NOTE(review): a live Stadia Maps API key is hard-coded and
# persisted (write = TRUE). Rotate this key and read it from an
# environment variable instead of committing it to source control.
register_stadiamaps("04c1350e-f51f-4265-b458-ad6b6a3192bb", write = TRUE)
# Bounding box for the city of Seattle
seattle <- c(left = -122.45, bottom = 47.48, right = -122.2, top = 47.73)
# Fixed: zoom = 18 over a city-wide bounding box requests an enormous
# number of tiles; zoom 13 is sufficient at this extent.
seattle_map <- get_stadiamap(seattle, zoom = 13)
# Fixed: df_clean_ll is a Spark table — plotting it directly would pull
# millions of rows; sample a small fraction and collect locally first.
map_points <- df_clean_ll %>%
  select(Blurred_Longitude, Blurred_Latitude, Final_Call_Type) %>%
  sdf_sample(fraction = 0.01, replacement = FALSE) %>%
  collect()
# Plotting the map
ggmap(seattle_map) +
  geom_point(data = map_points, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Final_Call_Type)) +
  labs(title = "Map of Seattle City",
       x = "Longitude",
       y = "Latitude",
       color = "Final Call Type")
include_graphics("Images/PointsOnTheMap.png")
# Count distinct call types directly in Spark.
# Fixed: the original sampled 50% of rows, collected them, and counted
# unique values locally — which wastes memory and can undercount rare
# categories. n_distinct() computes exact counts without collecting
# row-level data. (Counts below were recorded from the sampled run and
# may differ slightly from the exact values.)
distinct_call_types <- df_clean_ll %>%
  summarise(
    n_initial = n_distinct(Initial_Call_Type),
    n_final = n_distinct(Final_Call_Type)
  ) %>%
  collect()
print(distinct_call_types$n_initial)
## [1] 269
print(distinct_call_types$n_final)
## [1] 245
# Map the ~269 raw Initial_Call_Type values into a small set of
# categories. case_when() branches are evaluated in order: the first
# matching pattern wins.
# Fixed: the Traffic pattern ended with a trailing "|", creating an
# empty alternative that matches EVERY string — so all rows not caught
# by an earlier branch became "Traffic Incident" and the Missing
# Persons / Observation / No action / Other branches were unreachable.
# The trailing "|" and a duplicated "DUI" alternative are removed.
df_clean_ll <- df_clean_ll %>%
  mutate(Initial_Category = case_when(
    grepl("ASLT|Assault|ASSAULT|ASSAULTS|HARRASMENT|THREAT|THREATS|WEAPON|GUN|PANHANDLING|HARASSMENT|VIOLENT", Initial_Call_Type) ~ "Assaults and Threats",
    grepl("TRAFFICING|SEX|RAPE|PORNOGRAPHY|PROSTITUTION|LEWD|PROWLER", Initial_Call_Type) ~ "Sex Offenses",
    grepl("NARCOTICS|DRUGS|MARIJUANA|OVERDOSE|OD|LIQUOR|DETOX|INTOX|LIQ", Initial_Call_Type) ~ "Narcotics",
    grepl("HARBOR|ANIMAL|GAMBLING|WATER|TREES|NORAD|STADIUM|ILLEGAL DUMPING|SLEEPER|HAZ|BIAS|NUISANCE|URINATING|HOSPITAL|PHONE|CROWD|EVENT|DEMONSTRATIONS|DISTURBANCE|UNUSUAL|NOISE|POWER|LANDLINE|LITTERING", Initial_Call_Type) ~ "Civil incidents and security",
    grepl("DOA|SHOTS|CASUALTY|FELONY|SUSPICIOUS|ESCAPE|FIRE|PURSUIT|SWAT|SHOOTING|SUICIDE|HOSTAGE|HOMICIDE", Initial_Call_Type) ~ "Emergency and Critical incidents",
    grepl("ROBBERY|BURGLARY|PROPERTY|THEFT|BREAKING|SHOPLIFT|ARSON|TRESPASS|BURG|BURN|EXPLOSION|FRAUD", Initial_Call_Type) ~ "Property Crimes",
    grepl("ALARM|ORDER|INSPECTION|WATCH", Initial_Call_Type) ~ "Alarm and Security",
    grepl("ASSIST|CHECK|HELP|ASSIGNED|PATROL", Initial_Call_Type) ~ "Assistance and Checks",
    grepl("DOMESTIC|ABUSE|CUSTODIAL|ARGUMENTS|DV", Initial_Call_Type) ~ "Domestic Violence",
    grepl("Traffic|VIOLATIONS|ACCIDENT|MVC|CAR|DUI|TRAF|ROAD|VEHICLE|ACC|HIT AND RUN", Initial_Call_Type) ~ "Traffic Incident",
    grepl("MISSING|AWOL|FOUND|RUNAWAY|ABDUCTION|KIDNAP|CHILD|JUVENILE|LOST|AMBER|A.W.O.L.", Initial_Call_Type) ~ "Missing Persons",
    grepl("OBS", Initial_Call_Type) ~ "Observation",
    grepl("CANCELLED|NO ANSWER|OUT AT RANGE", Initial_Call_Type) ~ "No action",
    TRUE ~ "Other"
  ))
# Same categorization applied to Final_Call_Type.
# Fixed: same trailing "|" bug as the Initial_Category mapping — the
# empty alternative in the Traffic pattern matched every string, making
# all later branches unreachable. Trailing "|" and duplicate "DUI"
# removed.
df_clean_ll <- df_clean_ll %>%
  mutate(Final_Category = case_when(
    grepl("ASLT|Assault|ASSAULT|ASSAULTS|HARRASMENT|THREAT|THREATS|WEAPON|GUN|PANHANDLING|HARASSMENT|VIOLENT", Final_Call_Type) ~ "Assaults and Threats",
    grepl("TRAFFICING|SEX|RAPE|PORNOGRAPHY|PROSTITUTION|LEWD|PROWLER", Final_Call_Type) ~ "Sex Offenses",
    grepl("NARCOTICS|DRUGS|MARIJUANA|OVERDOSE|OD|LIQUOR|DETOX|INTOX|LIQ", Final_Call_Type) ~ "Narcotics",
    grepl("HARBOR|ANIMAL|GAMBLING|WATER|TREES|NORAD|STADIUM|ILLEGAL DUMPING|SLEEPER|HAZ|BIAS|NUISANCE|URINATING|HOSPITAL|PHONE|CROWD|EVENT|DEMONSTRATIONS|DISTURBANCE|UNUSUAL|NOISE|POWER|LANDLINE|LITTERING", Final_Call_Type) ~ "Civil incidents and security",
    grepl("DOA|SHOTS|CASUALTY|FELONY|SUSPICIOUS|ESCAPE|FIRE|PURSUIT|SWAT|SHOOTING|SUICIDE|HOSTAGE|HOMICIDE", Final_Call_Type) ~ "Emergency and Critical incidents",
    grepl("ROBBERY|BURGLARY|PROPERTY|THEFT|BREAKING|SHOPLIFT|ARSON|TRESPASS|BURG|BURN|EXPLOSION|FRAUD", Final_Call_Type) ~ "Property Crimes",
    grepl("ALARM|ORDER|INSPECTION|WATCH", Final_Call_Type) ~ "Alarm and Security",
    grepl("ASSIST|CHECK|HELP|ASSIGNED|PATROL", Final_Call_Type) ~ "Assistance and Checks",
    grepl("DOMESTIC|ABUSE|CUSTODIAL|ARGUMENTS|DV", Final_Call_Type) ~ "Domestic Violence",
    grepl("Traffic|VIOLATIONS|ACCIDENT|MVC|CAR|DUI|TRAF|ROAD|VEHICLE|ACC|HIT AND RUN", Final_Call_Type) ~ "Traffic Incident",
    grepl("MISSING|AWOL|FOUND|RUNAWAY|ABDUCTION|KIDNAP|CHILD|JUVENILE|LOST|AMBER|A.W.O.L.", Final_Call_Type) ~ "Missing Persons",
    grepl("OBS", Final_Call_Type) ~ "Observation",
    grepl("CANCELLED|NO ANSWER|OUT AT RANGE", Final_Call_Type) ~ "No action",
    TRUE ~ "Other"
  ))
# Compare initial vs final categorization per event.
# Fixed: added collect() — the aggregate was a lazy Spark table handed
# straight to ggplot(), inconsistent with the collected count plots
# earlier in the file.
same_diff_counts <- df_clean_ll %>%
  group_by(Same_Category = ifelse(Initial_Category == Final_Category, "Same", "Different")) %>%
  summarise(Count = n()) %>%
  collect()
# Bar chart for displaying values
ggplot(same_diff_counts, aes(x = Same_Category, y = Count, fill = Same_Category)) +
  geom_bar(stat = "identity") +
  labs(title = "Comparison of Initial and Final Categories",
       x = "Category",
       y = "Number of Occurrences",
       fill = "Category")
# Release the local aggregate.
rm(same_diff_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3799306 203.0 7700091 411.3 7700091 411.3
## Vcells 44152721 336.9 129218969 985.9 161092184 1229.1
# Distribution of the derived Initial_Category.
# Fixed: added collect() before plotting (aggregate was a lazy Spark
# table).
initial_category_counts <- df_clean_ll %>%
  group_by(Initial_Category) %>%
  summarise(Count = n()) %>%
  collect()
ggplot(initial_category_counts, aes(x = reorder(Initial_Category, -Count), y = Count, fill = Initial_Category)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribution of Initial Categorizations",
       x = "Initial Category",
       y = "Number of Occurrences") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Release the local aggregate.
rm(initial_category_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3805829 203.3 7700091 411.3 7700091 411.3
## Vcells 44168157 337.0 129218969 985.9 161092184 1229.1
# Distribution of the derived Final_Category.
# Fixed: added collect() before plotting (aggregate was a lazy Spark
# table).
final_category_counts <- df_clean_ll %>%
  group_by(Final_Category) %>%
  summarise(Count = n()) %>%
  collect()
ggplot(final_category_counts, aes(x = reorder(Final_Category, -Count), y = Count, fill = Final_Category)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribution of Final Categorizations",
       x = "Final Category",
       y = "Number of Occurrences") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Release the local aggregate.
rm(final_category_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3805120 203.3 7700091 411.3 7700091 411.3
## Vcells 44166515 337.0 129218969 985.9 161092184 1229.1
# Scatter of call locations coloured by Final_Category, on a 50%
# sample collected locally.
location_final_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude, Final_Category) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>% # Sample 50% of data
collect()
ggplot(location_final_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Final_Category)) +
geom_point(alpha = 0.6) +
labs(title = "Visualization of the Relationship Between Final Category and Call Location",
x = "Longitude",
y = "Latitude",
color = "Final Category") +
theme_minimal() +
theme(legend.position = "bottom")
# Release the sampled data.
rm(location_final_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3820454 204.1 7700091 411.3 7700091 411.3
## Vcells 88402502 674.5 157530106 1201.9 161092184 1229.1
# Scatter of call locations coloured by Sector, on a 50% sample
# collected locally.
location_final_sample <- df_clean_ll %>%
  select(Blurred_Longitude, Blurred_Latitude, Sector) %>%
  sdf_sample(fraction = 0.5, replacement = FALSE) %>% # Sample 50% of data
  collect()
# Fixed: the axis labels were swapped — x maps longitude and y maps
# latitude.
ggplot(location_final_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Sector)) +
  geom_point(alpha = 0.6) +
  labs(title = "Visualization of the Relationship Between Sector and Call Location",
       x = "Longitude",
       y = "Latitude",
       color = "Sector") +
  theme_minimal() +
  theme(legend.position = "bottom")
# Release the sampled data.
rm(location_final_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3826425 204.4 7700091 411.3 7700091 411.3
## Vcells 88439457 674.8 192839609 1471.3 192839609 1471.3
# Scatter of call locations coloured by Precinct, on a 50% sample
# collected locally. (Fixed: the inline comment previously claimed a
# 1% sample while the fraction is 0.5.)
location_final_sample <- df_clean_ll %>%
  select(Blurred_Longitude, Blurred_Latitude, Precinct) %>%
  sdf_sample(fraction = 0.5, replacement = FALSE) %>% # Sample 50% of data
  collect()
# Fixed: the axis labels were swapped — x maps longitude and y maps
# latitude.
ggplot(location_final_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Precinct)) +
  geom_point(alpha = 0.6) +
  labs(title = "Visualization of the Relationship Between Police Precinct and Call Location",
       x = "Longitude",
       y = "Latitude",
       color = "Police Precinct") +
  theme_minimal() +
  theme(legend.position = "bottom")
# Release the sampled data.
rm(location_final_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3818818 204.0 7700091 411.3 7700091 411.3
## Vcells 88404806 674.5 193566819 1476.8 193566819 1476.8
# A Response_Time feature (minutes between queueing and arrival) is
# derived so responses can later be labelled fast vs slow for binary
# classification. A histogram of the distribution is inspected first
# to choose the fast/slow cutoff. (Fixed: narrative prose from the
# source document had been fused onto the code line; it is now a
# comment.)
df_times <- df_clean_ll %>%
  mutate(
    Response_Time = (unix_timestamp(Arrived_Time) - unix_timestamp(Original_Time_Queued)) / 60
  )
# Plot histogram of response times (10-minute bins — the original
# comment said 5-minute, contradicting binwidth = 10). Times are
# clipped to [0, 1000] minutes to drop clock glitches and extreme
# outliers.
df_times_filtered <- df_times %>% filter(Response_Time >= 0 & Response_Time <= 1000)
response_time_sample <- df_times_filtered %>%
  select(Response_Time) %>%
  sdf_sample(fraction = 0.1, replacement = FALSE) %>%
  collect()
ggplot(response_time_sample, aes(x = Response_Time)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Response Time",
       x = "Response Time (minutes)",
       y = "Frequency")
# Release the sampled data.
rm(response_time_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3800570 203.0 7700091 411.3 7700091 411.3
## Vcells 44708985 341.2 154853456 1181.5 193566819 1476.8
# A binary Response_Speed feature is created from the approximate
# median response time: 1 = response at or below the median ("fast"),
# 0 = slower ("slow"). (Fixed: narrative prose from the source
# document had been fused onto the code line; it is now a comment.
# Note the code uses <=, i.e. responses exactly at the median count as
# fast.)
# Calculate median in Spark without collecting all data
median_result <- df_times_filtered %>%
  summarise(median_time = percentile_approx(Response_Time, 0.5)) %>%
  collect()
median_value <- median_result$median_time
# Create Response_Speed label
df_prepared <- df_times_filtered %>%
  mutate(Response_Speed = if_else(Response_Time <= median_value, 1, 0))
response_speeds <- df_prepared %>%
  group_by(Response_Speed) %>%
  summarise(Count = n()) %>%
  collect()
# Fixed: Response_Speed is numeric 0/1; mapping it through factor()
# gives a discrete fill scale instead of a gradient legend.
ggplot(response_speeds, aes(x = factor(Response_Speed), y = Count, fill = factor(Response_Speed))) +
  geom_bar(stat = "identity") +
  labs(title = "Distribution of Response Speed",
       x = "Response Speed",
       y = "Number of Occurrences",
       fill = "Response Speed")
# Release the local aggregate.
rm(response_speeds)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3814342 203.8 7700091 411.3 7700091 411.3
## Vcells 44178837 337.1 123882765 945.2 193566819 1476.8
# Exploratory plots of response time / speed against candidate
# predictors, on a 50% local sample.
viz_sample <- df_prepared %>%
  select(Priority, Initial_Category, Blurred_Longitude, Blurred_Latitude,
         Sector, Response_Time, Response_Speed) %>%
  sdf_sample(fraction = 0.5, replacement = FALSE) %>%
  collect()
ggplot(viz_sample, aes(x = Priority, y = Response_Time)) +
  geom_boxplot() +
  labs(title = "Response Time vs. Priority", x = "Priority", y = "Response Time")
ggplot(viz_sample, aes(x = Initial_Category, y = Response_Time)) +
  geom_boxplot() +
  labs(title = "Response Time vs. Initial Category", x = "Initial Category", y = "Response Time") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Fixed: the colour aesthetic maps Response_Speed but the legend was
# titled "Response Time"; also use factor() so the 0/1 label gets a
# discrete colour scale.
ggplot(viz_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = factor(Response_Speed))) +
  geom_point() +
  labs(title = "Response Speed vs. Location", x = "Longitude", y = "Latitude", color = "Response Speed")
ggplot(viz_sample, aes(x = Sector, y = Response_Time)) +
  geom_boxplot() +
  labs(title = "Response Time vs. Sector", x = "Sector", y = "Response Time") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Release the sampled data.
rm(viz_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3881460 207.3 7700091 411.3 7700091 411.3
## Vcells 129934535 991.4 222919322 1700.8 222919322 1700.8
# The boxplots above suggest Domestic Violence and Emergency and
# Critical Incidents also receive quick responses. Model features:
# Sector, Precinct, Initial_Category, Priority and the blurred
# coordinates. (Fixed: narrative prose from the source document had
# been fused onto the code line; it is now a comment.)
# Fixed (target leakage): Response_Time was previously kept in the
# modelling columns while Response_Speed is a deterministic threshold
# of Response_Time, so `Response_Speed ~ .` let every model read the
# answer directly — which explains the near-perfect AUCs recorded in
# the outputs below. Response_Time is now excluded from the model
# input; expect the reported metrics to drop accordingly.
df_prepared_split <- df_prepared %>%
  select(Response_Speed, Sector, Precinct, Initial_Category, Blurred_Longitude, Blurred_Latitude, Priority) %>%
  sdf_random_split(training = 0.8,
                   test = 0.2,
                   seed = 100)
train_data <- df_prepared_split$training
test_data <- df_prepared_split$test
# Logistic Regression
# Pipeline: r-formula feature assembly followed by logistic regression.
log_pipeline <- sc %>%
ml_pipeline() %>%
ft_r_formula(Response_Speed ~ .) %>%
ml_logistic_regression()
# Hyperparameter grid: L2 regularization strength.
param_grid <- list(
logistic_regression = list(reg_param = c(0.01, 0.1, 1))
)
# Model selection metric: area under the ROC curve.
lr_evaluator <- ml_binary_classification_evaluator(x = sc, metricName = "areaUnderROC")
# Cross-validation
# 5-fold CV over the 3 regularization values.
logistic_cv <- ml_cross_validator(
x = sc,
estimator = log_pipeline,
estimator_param_maps = param_grid,
evaluator = lr_evaluator,
num_folds = 5
)
print(logistic_cv)
## CrossValidator (Estimator)
## <cross_validator__7cbe1cdc_68b9_4738_bc49_466244bf60a6>
## (Parameters -- Tuning)
## estimator: Pipeline
## <pipeline__b925e8d9_6208_4186_ab5b_ebd7fc469ef0>
## evaluator: BinaryClassificationEvaluator
## <binary_classification_evaluator__2fefe4a9_4876_426e_a251_684934a53010>
## with metric areaUnderROC
## num_folds: 5
## [Tuned over 3 hyperparameter sets]
# Fit the cross-validator on the training split and inspect per-fold
# validation metrics for each regularization value.
model_cv <- ml_fit(
x = logistic_cv,
dataset = df_prepared_split$training
)
cv_metrics <- ml_validation_metrics(model_cv)
print(cv_metrics)
## areaUnderROC reg_param_1
## 1 0.9367413 0.01
## 2 0.8739719 0.10
## 3 0.8339178 1.00
# Validation AUC as a function of the regularization parameter.
# Fixed: geom_smooth() fit a loess curve through only three points,
# which emitted a wall of degenerate-fit warnings and added no
# information; the raw points are plotted instead.
cv_metrics %>%
  ggplot(aes(reg_param_1, areaUnderROC)) +
  geom_line() +
  geom_point()
# Based on the graph, it is determined that the model with a regularization parameter of 0.01 is the best
# Refit logistic regression on the full training split with the
# selected regularization value.
lmodel <- ml_logistic_regression(
df_prepared_split$training,
Response_Speed ~ .,
reg_param = 0.01
)
# Test-set metrics (ROC-AUC and PR-AUC) for the final model.
lrmx <- lmodel %>%
ml_predict(df_prepared_split$test) %>%
ml_metrics_binary()
# Random Forest
# Pipeline: r-formula feature assembly followed by a random forest.
rf_pipeline <- sc %>%
ml_pipeline() %>%
ft_r_formula(Response_Speed ~ .) %>%
ml_random_forest_classifier()
# Hyperparameter grid: forest size.
rf_grid <- list(
random_forest_classifier = list(
num_trees = c(3, 25, 50)
)
)
rf_evaluator = ml_binary_classification_evaluator(x = sc, metricName = "areaUnderROC")
# Cross-validation
# 5-fold CV over the 3 forest sizes.
rf_cv <- ml_cross_validator(
x = sc,
estimator = rf_pipeline,
estimator_param_maps = rf_grid,
evaluator = rf_evaluator,
num_folds = 5
)
model_rf_cv <- ml_fit(
x = rf_cv,
dataset = df_prepared_split$training
)
rf_cv_metrics <- ml_validation_metrics(model_rf_cv)
print(rf_cv_metrics)
## areaUnderROC num_trees_1
## 1 0.9717074 3
## 2 0.9939046 25
## 3 0.9944267 50
# Validation AUC as a function of forest size.
# Fixed: geom_smooth() fit a loess curve through only three points,
# producing degenerate-fit warnings; plot the raw points instead.
rf_cv_metrics %>%
  ggplot(aes(num_trees_1, areaUnderROC)) +
  geom_line() +
  geom_point()
# Selection of the best model
# Refit the random forest on the full training split with the selected
# forest size (50 trees).
rfmodel <- ml_random_forest_classifier(
df_prepared_split$training,
Response_Speed ~ .,
num_trees = 50
)
# Test-set metrics (ROC-AUC and PR-AUC) for the final model.
rfmx <- rfmodel %>%
ml_predict(df_prepared_split$test) %>%
ml_metrics_binary()
# Decision Tree: same pipeline structure as the other classifiers.
dt_pipeline <- sc %>%
ml_pipeline() %>%
ft_r_formula(Response_Speed ~ .) %>%
ml_decision_tree_classifier()
# Hyperparameter grid: tree depth.
dt_grid <- list(
decision_tree_classifier = list(
max_depth = c(3, 5, 10)
)
)
dt_evaluator <- ml_binary_classification_evaluator(x = sc, metricName = "areaUnderROC")
# Cross-validation
# 5-fold CV over the 3 depth values.
dt_cv <- ml_cross_validator(
x = sc,
estimator = dt_pipeline,
estimator_param_maps = dt_grid,
evaluator = dt_evaluator,
num_folds = 5
)
model_dt_cv <- ml_fit(
x = dt_cv,
dataset = df_prepared_split$training
)
dt_cv_metrics <- ml_validation_metrics(model_dt_cv)
print(dt_cv_metrics)
## areaUnderROC max_depth_1
## 1 0.9904559 3
## 2 0.9994886 5
## 3 0.9994836 10
# Validation AUC as a function of tree depth.
# Fixed: geom_smooth() fit a loess curve through only three points,
# producing degenerate-fit warnings; plot the raw points instead.
dt_cv_metrics %>%
  ggplot(aes(max_depth_1, areaUnderROC)) +
  geom_line() +
  geom_point()
# Selection of the most suitable model
# Refit the decision tree on the full training split with the selected
# depth (5 — deeper trees gave no CV improvement).
dtmodel <- ml_decision_tree_classifier(
df_prepared_split$training,
Response_Speed ~ .,
max_depth = 5
)
# Test-set metrics (ROC-AUC and PR-AUC) for the final model.
dtmx <- dtmodel %>%
ml_predict(df_prepared_split$test) %>%
ml_metrics_binary()
# Collect test-set metrics of the three fitted models into one table.
# Fixed: the original indexed `.estimate` by position (row 1 = ROC-AUC,
# row 2 = PR-AUC), which silently breaks if ml_metrics_binary() ever
# returns rows in a different order; metrics are now looked up by name.
# Returns the first .estimate whose .metric equals `metric`.
get_metric <- function(mx, metric) {
  mx$.estimate[mx$.metric == metric][1]
}
metrics_df <- data.frame(
  model = c("Logistic Regression", "Random Forest", "Decision Tree"),
  auc_roc = c(get_metric(lrmx, "roc_auc"),
              get_metric(rfmx, "roc_auc"),
              get_metric(dtmx, "roc_auc")),
  pr_auc = c(get_metric(lrmx, "pr_auc"),
             get_metric(rfmx, "pr_auc"),
             get_metric(dtmx, "pr_auc"))
)
print(metrics_df)
## model auc_roc pr_auc
## 1 Logistic Regression 0.9366466 0.9257672
## 2 Random Forest 0.9942390 0.9962228
## 3 Decision Tree 0.9994909 0.9994986
# Side-by-side bar charts comparing the three models on each metric.
ggplot(metrics_df, aes(x = model, y = auc_roc, fill = model)) +
  geom_col(position = "dodge") +
  labs(title = "ROC-AUC Comparison", y = "ROC-AUC") +
  theme_minimal()
ggplot(metrics_df, aes(x = model, y = pr_auc, fill = model)) +
  geom_col(position = "dodge") +
  labs(title = "PR-AUC Comparison", y = "PR-AUC") +
  theme_minimal()
# Columns used for spatial clustering.
dataset.clustering <- df_times %>%
  select(Blurred_Longitude, Blurred_Latitude, Final_Category, Precinct)
# Function that calculates the within-cluster sum of squares for
# k = 2..max_k (bisecting k-means needs k >= 2).
calculate_wcss <- function(data, max_k) {
  wcss <- rep(NA_real_, max_k)
  for (kc in 2:max_k) {
    model <- ml_bisecting_kmeans(data, ~Blurred_Longitude + Blurred_Latitude, k = kc, max_iter = 10)
    # NOTE(review): relies on `model$cost` being populated on this
    # sparklyr model object; if it were NULL, sum(NULL) is silently 0 —
    # verify with ml_compute_cost() if the curve looks degenerate.
    wcss[kc] <- sum(model$cost)
  }
  wcss
}
max_k <- 20
wcss_values <- calculate_wcss(dataset.clustering, max_k)
# Fixed: k = 1 previously carried a spurious WCSS of 0 (the loop never
# filled that slot), which distorted the elbow plot; it is now NA and
# dropped before plotting.
wcss_df <- data.frame(
  k = 1:max_k,
  WCSS = wcss_values
)
wcss_df <- wcss_df[!is.na(wcss_df$WCSS), ]
ggplot(wcss_df, aes(x = k, y = WCSS)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  labs(title = "Elbow Method for Determining the Optimal Number of Clusters",
       x = "Number of Clusters (k)",
       y = "Within-Cluster Sum of Squares (WCSS)") +
  theme_minimal() +
  theme(text = element_text(size = 16))
# Fit bisecting k-means on the blurred coordinates for the two
# candidate cluster counts, and attach cluster assignments
# (`prediction` column) to the data.
model.k7 <- ml_bisecting_kmeans(dataset.clustering, ~Blurred_Longitude + Blurred_Latitude, k = 7, seed = 1, max_iter = 10)
clusters.k7 <- ml_predict(model.k7, dataset.clustering)
model.k10 <- ml_bisecting_kmeans(dataset.clustering, ~Blurred_Longitude + Blurred_Latitude, k = 10, seed = 1, max_iter = 10)
clusters.k10 <- ml_predict(model.k10, dataset.clustering)
# Per-cluster variance of the blurred coordinates (evaluated in Spark;
# output column names match the input column names).
variance_within_clusters <- function(data) {
  data %>%
    group_by(prediction) %>%
    summarise(
      Blurred_Longitude = var(Blurred_Longitude),
      Blurred_Latitude = var(Blurred_Latitude)
    )
}
# Per-cluster mean / sd / median of the blurred coordinates; column
# names follow the "<column>_<stat>" convention produced by across().
cluster_summary <- function(data) {
  data %>%
    group_by(prediction) %>%
    summarise(
      Blurred_Longitude_mean = mean(Blurred_Longitude),
      Blurred_Longitude_sd = sd(Blurred_Longitude),
      Blurred_Longitude_median = median(Blurred_Longitude),
      Blurred_Latitude_mean = mean(Blurred_Latitude),
      Blurred_Latitude_sd = sd(Blurred_Latitude),
      Blurred_Latitude_median = median(Blurred_Latitude)
    )
}
# Cluster centres, sizes, variances and summary stats for k = 7.
# NOTE(review): the printed centres are 1-indexed rows while the
# `prediction` labels are 0-based, so centre row i corresponds to
# cluster label i - 1.
centers.k7 <- model.k7$centers
sizes.k7 <- table(dplyr::pull(clusters.k7, prediction))
variance.k7 <- variance_within_clusters(clusters.k7)
summary.k7 <- cluster_summary(clusters.k7)
print(centers.k7)
## Blurred_Longitude Blurred_Latitude
## 1 -122.3631 47.54180
## 2 -122.2873 47.54420
## 3 -122.3444 47.60907
## 4 -122.3154 47.60659
## 5 -122.3654 47.66635
## 6 -122.3076 47.66828
## 7 -122.3266 47.71304
print(sizes.k7)
##
## 0 1 2 3 4 5 6
## 554795 671762 1174157 1242294 759705 500271 623915
print(variance.k7)
## Warning: Missing values are always removed in SQL aggregation functions.
## Use `na.rm = TRUE` to silence this warning
## This warning is displayed once every 8 hours.
## # Source: SQL [?? x 3]
## # Database: spark_connection
## prediction Blurred_Longitude Blurred_Latitude
## <int> <dbl> <dbl>
## 1 4 0.000378 0.000378
## 2 1 0.000354 0.000399
## 3 5 0.000261 0.000142
## 4 6 0.000448 0.000162
## 5 3 0.000131 0.000170
## 6 0 0.000384 0.000313
## 7 2 0.000278 0.000169
print(summary.k7)
## # Source: SQL [?? x 7]
## # Database: spark_connection
## prediction Blurred_Longitude_mean Blurred_Longitude_sd Blurred_Longitude_med…¹
## <int> <dbl> <dbl> <dbl>
## 1 4 -122. 0.0194 -122.
## 2 1 -122. 0.0188 -122.
## 3 5 -122. 0.0162 -122.
## 4 6 -122. 0.0212 -122.
## 5 3 -122. 0.0114 -122.
## 6 0 -122. 0.0196 -122.
## 7 2 -122. 0.0167 -122.
## # ℹ abbreviated name: ¹Blurred_Longitude_median
## # ℹ 3 more variables: Blurred_Latitude_mean <dbl>, Blurred_Latitude_sd <dbl>,
## # Blurred_Latitude_median <dbl>
# Cluster centres, sizes, variances and summary stats for k = 10.
# NOTE(review): centre rows are 1-indexed while `prediction` labels
# are 0-based (row i ↔ cluster label i - 1).
centers.k10 <- model.k10$centers
sizes.k10 <- table(dplyr::pull(clusters.k10, prediction))
variance.k10 <- variance_within_clusters(clusters.k10)
summary.k10 <- cluster_summary(clusters.k10)
print(centers.k10)
## Blurred_Longitude Blurred_Latitude
## 1 -122.3631 47.54180
## 2 -122.2873 47.54420
## 3 -122.3909 47.57914
## 4 -122.3400 47.61188
## 5 -122.3151 47.59521
## 6 -122.3156 47.61687
## 7 -122.3770 47.67496
## 8 -122.3494 47.65050
## 9 -122.3076 47.66828
## 10 -122.3266 47.71304
print(sizes.k10)
##
## 0 1 2 3 4 5 6 7 8 9
## 554795 671762 100518 1073639 589620 652674 453997 305708 500271 623915
print(variance.k10)
## # Source: SQL [?? x 3]
## # Database: spark_connection
## prediction Blurred_Longitude Blurred_Latitude
## <int> <dbl> <dbl>
## 1 4 0.000134 0.0000646
## 2 8 0.000261 0.000142
## 3 1 0.000354 0.000399
## 4 5 0.000128 0.0000422
## 5 7 0.000180 0.000137
## 6 6 0.000210 0.000293
## 7 9 0.000448 0.000162
## 8 3 0.0000653 0.0000893
## 9 0 0.000384 0.000313
## 10 2 0.000177 0.0000405
print(summary.k10)
## # Source: SQL [?? x 7]
## # Database: spark_connection
## prediction Blurred_Longitude_mean Blurred_Longitude_sd Blurred_Longitude_me…¹
## <int> <dbl> <dbl> <dbl>
## 1 4 -122. 0.0116 -122.
## 2 8 -122. 0.0162 -122.
## 3 1 -122. 0.0188 -122.
## 4 5 -122. 0.0113 -122.
## 5 7 -122. 0.0134 -122.
## 6 6 -122. 0.0145 -122.
## 7 9 -122. 0.0212 -122.
## 8 3 -122. 0.00808 -122.
## 9 0 -122. 0.0196 -122.
## 10 2 -122. 0.0133 -122.
## # ℹ abbreviated name: ¹Blurred_Longitude_median
## # ℹ 3 more variables: Blurred_Latitude_mean <dbl>, Blurred_Latitude_sd <dbl>,
## # Blurred_Latitude_median <dbl>
# Scatter plot of points coloured by cluster assignment, with the
# cluster centres overplotted as black crosses.
plot_clusters <- function(data, centers, title) {
  point_mapping <- aes(
    x = Blurred_Longitude,
    y = Blurred_Latitude,
    color = as.factor(prediction)
  )
  center_mapping <- aes(x = Blurred_Longitude, y = Blurred_Latitude)
  ggplot(data, point_mapping) +
    geom_point(size = 2) +
    geom_point(data = centers, center_mapping, color = "black", size = 3, shape = 4) +
    labs(color = "Cluster", title = title, x = "Longitude", y = "Latitude") +
    theme_minimal() +
    theme(text = element_text(size = 16))
}
# Sample 1% of each clustered dataset for plotting.
sample_data.k7 <- clusters.k7 %>% sample_frac(0.01)
sample_data.k10 <- clusters.k10 %>% sample_frac(0.01)
# Fixed: these are cluster-assignment scatter plots, not density
# heatmaps — titles (and reused variable names) previously said
# "Density Heatmap", which misdescribed the figures.
heatmap3 <- plot_clusters(collect(sample_data.k7), centers.k7, "Cluster Assignments for K=7")
heatmap4 <- plot_clusters(collect(sample_data.k10), centers.k10, "Cluster Assignments for K=10")
ggsave("heatmap3.jpg", heatmap3)
## Saving 7 x 5 in image
ggsave("heatmap4.jpg", heatmap4)
## Saving 7 x 5 in image
include_graphics(c("heatmap3.jpg", "heatmap4.jpg"))
# Proportion of each Final_Category within every k = 7 cluster.
# Fixed: RColorBrewer's "Set3" palette has only 12 colours, but there
# are up to 14 Final_Category levels, which triggers a palette warning
# and leaves bars unfilled; the default discrete fill scale handles
# any number of levels. Set3 is kept for the Precinct plots, which
# have few levels.
ggplot(data = collect(sample_data.k7), aes(x = as.factor(prediction))) +
  geom_bar(aes(fill = Final_Category), position = "fill") +
  labs(x = "Cluster", y = "Proportion", title = "Distribution of Final Categories within Clusters") +
  theme_minimal() +
  theme(text = element_text(size = 16))
# Precinct and Cluster
ggplot(data = collect(sample_data.k7), aes(x = as.factor(prediction))) +
  geom_bar(aes(fill = Precinct), position = "fill") +
  labs(x = "Cluster", y = "Proportion", title = "Distribution of Police Departments within Clusters") +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(text = element_text(size = 16))
# Same pair of composition plots for k = 10.
ggplot(data = collect(sample_data.k10), aes(x = as.factor(prediction))) +
  geom_bar(aes(fill = Final_Category), position = "fill") +
  labs(x = "Cluster", y = "Proportion", title = "Distribution of Final Categories within Clusters") +
  theme_minimal() +
  theme(text = element_text(size = 16))
# Precinct and Cluster
ggplot(data = collect(sample_data.k10), aes(x = as.factor(prediction))) +
  geom_bar(aes(fill = Precinct), position = "fill") +
  labs(x = "Cluster", y = "Proportion", title = "Distribution of Police Departments within Clusters") +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(text = element_text(size = 16))